1 API 概览 && 编码Tips

1.1 文档地址

Chrome DevTools Protocol 官方文档: https://chromedevtools.github.io/devtools-protocol/

1.2 常用API

  • Network 网络请求、Cookie、缓存、证书等相关内容
  • Page 页面的加载、资源内容、弹层、截图、打印等相关内容
  • DOM 文档DOM的获取、修改、删除、查询等相关内容
  • Runtime JavaScript代码的执行,这里面我们可以搞事情~~

1.3 编码Tips

  • 我们这里不会直接调用Websocket相关的内容来调用chrome的调试命令,而是用chrome-remote-interface 这个封装的库来做,它是基于Promise风格的
  • 每一个功能块称为一个单独的domain,像Network,Page,DOM等都是不同的domain
  • 几乎每一个个头大的domain都有enable方法,需要先调用这个方法启用之后再使用
  • 各个domain的接口方法参数都是一个对象(或者说一个Map),不用考虑参数的位置了
  • 各个domain的接口返回值也是一个对象,取对应的key就行
  • 参数值和返回值经常是meta信息,经常是各种对象的id信息,而不是具体的对象内容(这里可能需要切一下风格)

2 编码实例

首先做一个简单的封装,准备API的执行环境,具体可参考前一篇关于工具库的文章。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
const chromeLauncher = require('chrome-launcher');
const chromeRemoteInterface = require('chrome-remote-interface');

/**
 * Launch a Chrome instance and attach a DevTools-protocol client to it.
 *
 * @param {Object} [config] Options; the whole object is also forwarded
 *     unchanged to chrome-remote-interface.
 * @param {string} [config.host='localhost'] Host passed to the launcher.
 * @param {number} [config.port=9222] Remote-debugging port passed to the launcher.
 * @param {boolean} [config.autoSelectChrome=true] Passed to the launcher.
 * @param {boolean} [config.headless=true] Adds `--headless` to the Chrome flags.
 * @returns {Promise<Array>} Resolves to `[chromeInstance, protocolClient]`.
 *     Rejects with the launcher's or the client's error.
 */
const prepareAPI = (config = {}) => {
    const {host = 'localhost', port = 9222, autoSelectChrome = true, headless = true} = config;

    // Only append `--headless` when requested, so that no empty-string
    // argument is handed to Chrome in headed mode.
    const flags = ['--disable-gpu'];
    if (headless) {
        flags.push('--headless');
    }

    return chromeLauncher.launch({
        host,
        port,
        autoSelectChrome,
        // NOTE(review): newer chrome-launcher releases document this option as
        // `chromeFlags` - confirm against the installed version.
        additionalFlags: flags
    }).then(chromeInstance => {
        return chromeRemoteInterface(config).then(
            chromeAPI => [chromeInstance, chromeAPI],
            err => {
                // The browser is already running: stop it before reporting
                // the connection failure, otherwise it would be left behind.
                const rethrow = () => {
                    throw err;
                };
                return Promise.resolve()
                    .then(() => chromeInstance.kill())
                    .then(rethrow, rethrow);
            }
        );
    });
};

2.1 打开百度,获取页面性能数据,参考 Navigation Timing W3C规范

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
const wrapper = require('the-wrapper-module');

/**
 * Convert a Navigation Timing snapshot into per-phase durations.
 *
 * Each duration is the difference between two timestamps of the snapshot
 * (milliseconds, as defined by the Navigation Timing specification).
 *
 * @param {Object} [perforceTiming] Object shaped like
 *     `window.performance.timing.toJSON()`. A falsy value is treated as `{}`.
 * @returns {{redirect: number, dns: number, tcp: number, request: number,
 *     response: number, domReady: number, load: number}} Durations; a field
 *     is `NaN` when either of its timestamps is missing from the input.
 */
const performanceParser = (perforceTiming) => {
    const timing = perforceTiming || {};
    const elapsed = (startKey, endKey) => timing[endKey] - timing[startKey];

    return {
        // Redirect time is redirectEnd - redirectStart.
        redirect: elapsed('redirectStart', 'redirectEnd'),
        dns: elapsed('domainLookupStart', 'domainLookupEnd'),
        tcp: elapsed('connectStart', 'connectEnd'),
        request: elapsed('requestStart', 'responseStart'),
        response: elapsed('responseStart', 'responseEnd'),
        // The two page-level figures are measured from the start of navigation.
        domReady: elapsed('navigationStart', 'domContentLoadedEventStart'),
        load: elapsed('navigationStart', 'loadEventStart')
    };
};

/**
 * Print each timing figure on its own line via `console.log`.
 *
 * @param {Object} [performanceInfo] Object with the fields `redirect`, `dns`,
 *     `tcp`, `request`, `response`, `domReady` and `load`. A falsy value is
 *     treated as `{}`, in which case every figure prints as `undefined`.
 */
const showPerformanceInfo = (performanceInfo) => {
    const info = performanceInfo || {};

    // [label, field] pairs, in output order.
    const rows = [
        ['页面重定向耗时:', 'redirect'],
        ['DNS查找耗时:', 'dns'],
        ['TCP连接耗时:', 'tcp'],
        ['请求发送耗时:', 'request'],
        ['响应接收耗时:', 'response'],
        ['DOMReady耗时:', 'domReady'],
        ['页面加载耗时:', 'load']
    ];

    rows.forEach(([label, field]) => {
        console.log(`${label}${info[field]}`);
    });
};

// Demo 2.1: open Baidu, read `window.performance.timing` once the page has
// loaded, print the derived durations, then release the browser.
wrapper.prepareAPI().then(([chromeInstance, remoteInterface]) => {
    const {Runtime, Page} = remoteInterface;

    // Close the protocol client and stop Chrome; Chrome is stopped even if
    // closing the client fails.
    const stopChrome = () => chromeInstance.kill();
    const shutdown = () => Promise.resolve()
        .then(() => remoteInterface.close())
        .then(stopChrome, stopChrome);

    // Read the timing data from the page and print it.
    const reportTiming = () => Runtime.evaluate({
        expression: 'window.performance.timing.toJSON()',
        // Without returnByValue only a remote-object handle (meta information)
        // comes back and a further getProperties call would be needed.
        returnByValue: true
    }).then(({result, exceptionDetails}) => {
        if (exceptionDetails) {
            // Throw a real Error (with the protocol details attached) rather
            // than the plain details object.
            const failure = new Error(exceptionDetails.text);
            failure.details = exceptionDetails;
            throw failure;
        }
        showPerformanceInfo(performanceParser(result.value));
    });

    // Register the load handler before navigating; the promise settles with
    // the outcome of the report so the chain below can wait for it.
    const reported = new Promise((resolve, reject) => {
        Page.loadEventFired(() => {
            reportTiming().then(resolve, reject);
        });
    });

    return Page.enable()
        .then(() => Promise.all([
            Page.navigate({
                url: 'http://www.baidu.com'
            }),
            reported
        ]))
        .then(shutdown, err => {
            const rethrow = () => {
                throw err;
            };
            return shutdown().then(rethrow, rethrow);
        });
}).catch(err => {
    // Surface any failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});

2.2 打开百度 搜索Web自动化 headless chrome,并爬取首屏结果链接

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
const wrapper = require('the-wrapper-module');
// Must stay a regular `function`, not an arrow function: `this` has to be the
// object the function is invoked on, and an arrow function would not bind it.
const buttonClick = function () {
    const target = this;
    target.click();
};

// Fill the element with id "kw" with the search phrase. The function only
// touches the global `document`, so it can be serialised and run in the page.
const setInputValue = () => {
    document.getElementById('kw').value = 'Web自动化 headless chrome';
};

/**
 * Collect title and link of every search result on the current page.
 *
 * Reads `div.result.c-container` elements from the global `document`; for
 * each one the `h3` heading supplies the title and the first `a` inside the
 * heading supplies the link. Blocks lacking a heading or a link are skipped
 * so that one irregular block does not abort the whole scrape.
 *
 * The function is self-contained (no outer references) so that its source
 * can be serialised and evaluated inside the page.
 *
 * @returns {Array<{title: string, link: string}>} Results in document order.
 */
const parseSearchResult = () => {
    const resultList = [];
    const linkBlocks = document.querySelectorAll('div.result.c-container');

    for (const block of Array.from(linkBlocks)) {
        const heading = block.querySelector('h3');
        const anchor = heading && heading.querySelector('a');
        if (!anchor) {
            continue;
        }
        resultList.push({
            title: heading.textContent,
            link: anchor.getAttribute('href')
        });
    }

    return resultList;
};


// Demo 2.2: open Baidu, submit a search through the page's own form and print
// the title/link pairs found on the first result page.
wrapper.prepareAPI({
    // headless: false // uncomment to watch the browser while the script runs
}).then(([chromeInstance, remoteInterface]) => {
    const {Runtime, DOM, Page, Network} = remoteInterface;

    // Close the protocol client and stop Chrome; Chrome is stopped even if
    // closing the client fails.
    const stopChrome = () => chromeInstance.kill();
    const shutdown = () => Promise.resolve()
        .then(() => remoteInterface.close())
        .then(stopChrome, stopChrome);

    // Debug aid: log which page event fired and the evaluation result for
    // `window.location.href` at that moment. Failures are only logged.
    const logLocation = (eventName) => {
        console.log(eventName);
        Runtime.evaluate({
            expression: `window.location.href`,
            returnByValue: true
        }).then(result => {
            console.log(result);
        }).catch(err => {
            console.error(err);
        });
    };

    // Enable the protocol domains used below. setAutoAttachToCreatedPages is
    // only called when the connected browser exposes it.
    const enableDomains = () => {
        const pending = [Page.enable(), Network.enable(), DOM.enable()];
        if (typeof Page.setAutoAttachToCreatedPages === 'function') {
            pending.push(Page.setAutoAttachToCreatedPages({autoAttach: true}));
        }
        return Promise.all(pending);
    };

    // Fill in the search box and click the submit button of the home-page form.
    const submitSearch = () => DOM.getDocument()
        // The search form of the Baidu home page.
        .then(({root}) => DOM.querySelector({
            nodeId: root.nodeId,
            selector: '#form'
        }))
        // The submit button inside that form.
        .then(({nodeId}) => DOM.querySelector({
            nodeId,
            selector: '#su'
        }))
        .then(buttonNode => {
            // The value is set by running setInputValue inside the page; an
            // equivalent inline expression would be
            //   document.getElementById("kw").value = "..."
            // Per the original author's notes, DOM.setNodeValue on the input
            // node had no effect, while DOM.setAttributeValue with
            // name 'value' is the DOM-domain way to do it.
            return Runtime.evaluate({
                expression: `(${setInputValue})()`
            }).then(() => Runtime.evaluate({
                // Read the value back for logging; this runs only after the
                // assignment above has completed.
                expression: 'document.getElementById("kw").value'
            })).then(({result}) => {
                console.log(result);
                return DOM.resolveNode({
                    nodeId: buttonNode.nodeId
                });
            });
        })
        // Click the button by calling buttonClick with the node as `this`.
        .then(({object}) => Runtime.callFunctionOn({
            objectId: object.objectId,
            functionDeclaration: `${buttonClick}`
        }));

    // Wait three seconds after the click, then scrape the result page.
    const collectResults = () => new Promise(resolve => {
        setTimeout(resolve, 3e3);
    }).then(() => Runtime.evaluate({
        expression: `(${parseSearchResult})()`,
        returnByValue: true
    })).then(({result}) => {
        // Baidu result links are obfuscated redirect URLs; another request
        // per link is needed to obtain the real target URL.
        console.log(result.value);
    });

    return enableDomains().then(() => {
        Page.domContentEventFired(() => {
            logLocation('Page.domContentEventFired');
        });
        Page.frameNavigated(() => {
            logLocation('Page.frameNavigated');
        });

        // The load event also fires for pages loaded after the form has been
        // submitted, so the search is started on the first load only.
        const searchFinished = new Promise((resolve, reject) => {
            let searchStarted = false;
            Page.loadEventFired(() => {
                logLocation('Page.loadEventFired');
                if (searchStarted) {
                    return;
                }
                searchStarted = true;
                submitSearch().then(collectResults).then(resolve, reject);
            });
        });

        return Promise.all([
            Page.navigate({
                url: 'http://www.baidu.com'
            }),
            searchFinished
        ]);
    }).then(shutdown, err => {
        const rethrow = () => {
            throw err;
        };
        return shutdown().then(rethrow, rethrow);
    });
}).catch(err => {
    // Surface any failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});

最后更新: 2022年03月02日 03:32

原始链接: http://rawbin-.github.io/automatic/2017-06-12-headless-chrome-demo/

× 赞赏这个人~
打赏二维码